# Course: BUAN 5210
# Title: Technical Appendix
# Purpose: Observe the effectiveness of in-store promotions and advertisements based on Basic EDA and detailed EDA
# Date: Feb 17th, 2019
# Author: Qianhui Guo, Hannah Khuong
# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
# load packages
library(tidyverse)
library(GGally)
library(gridExtra)
library(readr)
library(here)
library(janitor)
library(stringr)
library(knitr)
library(Hmisc)
library(kableExtra)
library(htmlTable)
library(car)
library(sjPlot)
product <- read_csv(here("mtp_product_data.csv"))
sales <- read_csv(here("mtp_sales_data.csv"))
# view data
str(product)
## Classes 'tbl_df', 'tbl' and 'data.frame': 114 obs. of 5 variables:
## $ UPC : chr "00-01-16000-11653" "00-01-16000-11945" "00-01-16000-14154" "00-01-16000-14156" ...
## $ brand : chr "GENERAL MILLS CINNAMON TST CR" "GENERAL MILLS CHEERIOS" "GENERAL MILLS CINNAMON TST CR" "GENERAL MILLS LUCKY CHARMS" ...
## $ flavor : chr "CINNAMON TOAST" "TOASTED" "CINNAMON TOAST" "TOASTED" ...
## $ volume : num 0.06 0.04 0.12 0.11 0.08 0.89 0.98 0.87 0.8 1.5 ...
## $ package: chr "BOX" "BOX" "CUP" "CUP" ...
## - attr(*, "spec")=
## .. cols(
## .. UPC = col_character(),
## .. brand = col_character(),
## .. flavor = col_character(),
## .. volume = col_double(),
## .. package = col_character()
## .. )
str(sales)
## Classes 'tbl_df', 'tbl' and 'data.frame': 21850 obs. of 7 variables:
## $ UPC : chr "01.16000.11653" "01.16000.11653" "01.16000.11653" "01.16000.11945" ...
## $ iri_key: num 644347 248741 535806 675634 205272 ...
## $ week : num 6 5 11 11 13 14 39 35 45 5 ...
## $ units : num 5 2 3 2 8 5 6 1 4 14 ...
## $ price : num 0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
## $ promo : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ad : chr "A" "NONE" "NONE" "NONE" ...
## - attr(*, "spec")=
## .. cols(
## .. UPC = col_character(),
## .. iri_key = col_double(),
## .. week = col_double(),
## .. units = col_double(),
## .. price = col_double(),
## .. promo = col_double(),
## .. ad = col_character()
## .. )
#Data Manipulation
# match UPC format
upc_pattern <- "([0][1-3][. -][1-8]{2}[0]{3}[. -][0-9]{5})" # define pattern for UPC
product <- product %>%
mutate(
UPC =str_extract_all(UPC,upc_pattern), # find out the same part
UPC = str_replace_all(UPC,'-','.') ) %>% # replace '-'with '.'
select(-package) # package is useless in this moment, so we don't take it into account
# left join
tidy_table <-left_join(sales,product) %>%
mutate(
# rename the category within the ad
ad = case_when(
ad == 'A' ~ 'Big Ads',
ad == 'B' ~ 'Medium/Small Ads',
ad == "NONE" ~ 'No Ads'
),
# convert character into factor
iri_key = as.factor(iri_key),
ad = as.factor(ad),
promo = as.factor(promo),
flavor = as.factor(flavor),
# calculate total price for each purchase
revenue = units * price,
# seperate the brand into producer and product name
producer = ifelse(str_detect(brand,"GENERAL MILLS"),"GENERAL MILLS",
ifelse(str_detect(brand,"KELLOGGS"),"KELLOGGS",
ifelse(str_detect(brand,"POST"),"POST",NA))),
producer = as.factor(producer)
) %>%
# abandon store number as well
select(-iri_key,-brand)
#Rename promo level
levels(tidy_table$promo) <- c("No", "Yes")
# descriptive statistics
summary(tidy_table)
## UPC week units price
## Length:21850 Min. : 1.00 Min. : 1.000 Min. :0.250
## Class :character 1st Qu.:14.00 1st Qu.: 3.000 1st Qu.:3.190
## Mode :character Median :27.00 Median : 7.000 Median :3.790
## Mean :26.62 Mean : 8.579 Mean :3.763
## 3rd Qu.:40.00 3rd Qu.:12.000 3rd Qu.:4.390
## Max. :52.00 Max. :28.000 Max. :9.990
## promo ad flavor
## No :17305 Big Ads : 1456 CINNAMON TOAST:1834
## Yes: 4545 Medium/Small Ads: 1061 COCOA :1901
## No Ads :19333 FRUIT :2192
## REGULAR :8816
## TOASTED :7107
##
## volume revenue producer
## Min. :0.040 Min. : 0.48 GENERAL MILLS: 7189
## 1st Qu.:0.750 1st Qu.: 11.80 KELLOGGS :12183
## Median :1.060 Median : 24.50 POST : 2478
## Mean :1.016 Mean : 31.01
## 3rd Qu.:1.120 3rd Qu.: 44.09
## Max. :4.000 Max. :155.48
#Create function for frequency tables
count_table <- function(x,colname){
x = enquo(x)
kable(
tidy_table %>%
tabyl(!!x) %>%
adorn_totals()%>%
adorn_pct_formatting(digits = 0 ),
digits = 2,
format = "html",
align = c("l","c","c"),
col.names = c(colname,"Count","Total")
)%>%
kable_styling(full_width = F)}
#Make count tables for univariate variables
count_table(promo,"Promotion")
| Promotion | Count | Total |
|---|---|---|
| No | 17305 | 79% |
| Yes | 4545 | 21% |
| Total | 21850 | 100% |
count_table(ad,"Advertisement")
| Advertisement | Count | Total |
|---|---|---|
| Big Ads | 1456 | 7% |
| Medium/Small Ads | 1061 | 5% |
| No Ads | 19333 | 88% |
| Total | 21850 | 100% |
count_table(flavor,"Flavor")
| Flavor | Count | Total |
|---|---|---|
| CINNAMON TOAST | 1834 | 8% |
| COCOA | 1901 | 9% |
| FRUIT | 2192 | 10% |
| REGULAR | 8816 | 40% |
| TOASTED | 7107 | 33% |
| Total | 21850 | 100% |
count_table(producer,"Producer")
| Producer | Count | Total |
|---|---|---|
| GENERAL MILLS | 7189 | 33% |
| KELLOGGS | 12183 | 56% |
| POST | 2478 | 11% |
| Total | 21850 | 100% |
#Functions for graphs
#Count bargraph
count_bargraph <- function(x) {
x + geom_bar(position = "dodge") +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
labs(y ="Count")
}
#Count histogram
count_hist<- function(x){
x + geom_histogram(bins = 52)+
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
labs(y ="Count")
}
#Make bar chart with values
bar_chart <- function(x){ x +
geom_bar(stat = "identity",position = "dodge") + theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
}
grid.arrange(
count_bargraph(ggplot(tidy_table, aes(promo))) +
xlab("Promotion"),
count_bargraph(ggplot(tidy_table, aes(ad))) +
xlab("Advertisement"),
count_bargraph(ggplot(tidy_table, aes(flavor))) +
xlab("Flavor")+
theme(axis.text.x = element_text(angle=60, hjust=1)),
count_bargraph(ggplot(tidy_table, aes(producer))) +
xlab("Producer") +
theme(axis.text.x = element_text(angle=60, hjust=1)),
nrow = 2)
# create histograms of continuous variables
grid.arrange(
count_hist(ggplot(tidy_table, aes(week))),
count_hist(ggplot(tidy_table, aes(price))),
count_hist(ggplot(tidy_table, aes(units))),
count_hist(ggplot(tidy_table, aes(revenue))),
count_hist(ggplot(tidy_table, aes(volume))),
nrow = 2
)
#Function for table
freq_table <- function(x, y, name, cols){
x = enquo(x)
y = enquo(y)
kable(
tidy_table %>%
tabyl(!!x, !!y) %>%
adorn_totals(where = c('row','col')) %>%
adorn_percentages(denominator = "all")%>%
adorn_pct_formatting(digits = 0 ),
digits = 2,
format = "html",
align = c("l","c","c","c","c"),
caption = name,
col.names = cols
)%>%
kable_styling(full_width = F)
}
# Create frequency tables (by percentage)
#Producer - Ads
freq_table(producer, ad, "Percent of Sales under Advertisement for Different Producers",
c("Producer", "Big Ads", "Medium/Small Ads", "No Ads", "Total"))
| Producer | Big Ads | Medium/Small Ads | No Ads | Total |
|---|---|---|---|---|
| GENERAL MILLS | 2% | 1% | 30% | 33% |
| KELLOGGS | 4% | 3% | 49% | 56% |
| POST | 1% | 1% | 10% | 11% |
| Total | 7% | 5% | 88% | 100% |
#Producer - Promotion
freq_table(producer, promo,
"Percent of Sales under Promotion for Different Producers",
c('Producer','No Promotion','With Promotion','Total'))
| Producer | No Promotion | With Promotion | Total |
|---|---|---|---|
| GENERAL MILLS | 27% | 6% | 33% |
| KELLOGGS | 43% | 12% | 56% |
| POST | 9% | 3% | 11% |
| Total | 79% | 21% | 100% |
# Create frequency tables (by percentage)
#Flavor - Advertisement
freq_table(flavor, ad, "Percent of Sales under Advertisement for Different Flavors",
c("Flavor", "Big Ads", "Medium/Small Ads", "No Ads", "Total"))
| Flavor | Big Ads | Medium/Small Ads | No Ads | Total |
|---|---|---|---|---|
| CINNAMON TOAST | 1% | 0% | 8% | 8% |
| COCOA | 1% | 0% | 8% | 9% |
| FRUIT | 1% | 0% | 9% | 10% |
| REGULAR | 2% | 2% | 36% | 40% |
| TOASTED | 2% | 2% | 28% | 33% |
| Total | 7% | 5% | 88% | 100% |
#Flavor - Promotion
freq_table(flavor, promo, "Percent of Sales under Promotion for Different Flavors",
c('Flavor','No Promotion','With Promotion','Total'))
| Flavor | No Promotion | With Promotion | Total |
|---|---|---|---|
| CINNAMON TOAST | 7% | 1% | 8% |
| COCOA | 6% | 2% | 9% |
| FRUIT | 8% | 2% | 10% |
| REGULAR | 32% | 8% | 40% |
| TOASTED | 26% | 7% | 33% |
| Total | 79% | 21% | 100% |
#Frequency Table
#Advertising - Promotion
freq_table(ad, promo,"Percent of Sales under Promotion and Advertising" ,c("Advertisement",'No Promotion','With Promotion','Total'))
| Advertisement | No Promotion | With Promotion | Total |
|---|---|---|---|
| Big Ads | 3% | 4% | 7% |
| Medium/Small Ads | 2% | 3% | 5% |
| No Ads | 74% | 14% | 88% |
| Total | 79% | 21% | 100% |
Create Heatmap to visualize number of sales
#Function for heatmap
heat_map <- function (x, y,y_title,x_title){
x<-enquo(x)
y<-enquo(y)
tidy_table %>%
group_by(!!x,!!y) %>%
summarise(count = n()) %>%
ggplot(aes(!!x,!!y)) +
geom_tile(aes(fill = -count))+
ylab(y_title) + xlab(x_title)+
scale_fill_continuous(guide = guide_legend(title = "Count"))
}
grid.arrange(
heat_map(producer,promo,"Promotion","Producer"),
heat_map(producer,ad,"advertisement","Producer"),
heat_map(flavor,promo,"Promotion","Flavor"),
heat_map(flavor,ad,"Advertisement","Flavor"),
heat_map(ad,promo,"Promotion","Advertisement"),
nrow=3)
# find out correlation between quantative via ggpairs()
tidy_table %>%
select(units,price,revenue,volume, week) %>% # select numerical variables
ggpairs()
# Make Scatter-plot for Volume - Price
tidy_table %>%
ggplot(aes(x = volume, y = price)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
grid.arrange(
count_bargraph(ggplot(tidy_table, aes(producer, fill = ad))) +
xlab("Producer"),
count_bargraph(ggplot(tidy_table, aes(producer, fill = promo))) +
xlab("Producer"),ncol=2)
bar_chart(
tidy_table %>%
select(producer,flavor,revenue) %>%
group_by(producer, flavor) %>%
summarise(
revenue = sum(revenue)
)%>%
ggplot(aes(x = producer, y = revenue, fill = flavor)))
bar_chart_math <- function(x, y, z, math, produc, ytitle, guide){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
tidy_table %>%
filter(producer == produc) %>%
ggplot(aes(x = reorder(!!x, !!y), y = !!y, fill = !!z))+
stat_summary( fun.y= math, geom="bar",position = "dodge") + theme_bw() + ylab(ytitle)+xlab("Flavors")+
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
scale_fill_brewer(palette = "Blues",guide = guide_legend(title = guide))
}
grid.arrange(
bar_chart_math(flavor, revenue, promo, "mean", "GENERAL MILLS", "Revenue", "Promotion")+
stat_summary(geom= "errorbar", fun.data = "mean_cl_normal", width = 0.4, position = position_dodge(0.9)) ,
bar_chart_math(flavor, units, promo, "median", "GENERAL MILLS", "Units sold", "Promotion"),nrow = 2, top = "Promotion on Revenue and Units sold")
grid.arrange(
bar_chart_math(flavor, revenue, ad, "mean", "GENERAL MILLS", "Revenue", "Advertisement")+
stat_summary(geom= "errorbar", fun.data = "mean_cl_normal", width = 0.4, position = position_dodge(0.9)),
bar_chart_math(flavor, units, ad, "median", "GENERAL MILLS", "Units sold", "Advertisement"), nrow = 2,
top = "Advertisement on Revenue and Units sold")
#Statistical testing
GM <- tidy_table%>%filter(producer == "GENERAL MILLS")
#Promotion
summary(aov(revenue[flavor == "TOASTED"]~promo[flavor == "TOASTED"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## promo[flavor == "TOASTED"] 1 1201 1201.4 1.528 0.217
## Residuals 3130 2461592 786.5
summary(aov(revenue[flavor == "COCOA"]~promo[flavor == "COCOA"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## promo[flavor == "COCOA"] 1 6162 6162 16.73 4.64e-05 ***
## Residuals 1018 374921 368
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(revenue[flavor == "CINNAMON TOAST"]~promo[flavor == "CINNAMON TOAST"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## promo[flavor == "CINNAMON TOAST"] 1 2913 2912.6 4.598 0.0321 *
## Residuals 1832 1160468 633.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(revenue[flavor == "REGULAR"]~promo[flavor == "REGULAR"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## promo[flavor == "REGULAR"] 1 45 44.9 0.089 0.766
## Residuals 1201 605711 504.3
#Ads
summary(aov(revenue[flavor == "TOASTED"]~ad[flavor == "TOASTED"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## ad[flavor == "TOASTED"] 2 831 415.5 0.528 0.59
## Residuals 3129 2461962 786.8
summary(aov(revenue[flavor == "COCOA"]~ad[flavor == "COCOA"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## ad[flavor == "COCOA"] 2 2951 1475.3 3.968 0.0192 *
## Residuals 1017 378133 371.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(revenue[flavor == "CINNAMON TOAST"]~ad[flavor == "CINNAMON TOAST"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## ad[flavor == "CINNAMON TOAST"] 2 937 468.3 0.738 0.478
## Residuals 1831 1162444 634.9
summary(aov(revenue[flavor == "REGULAR"]~ad[flavor == "REGULAR"], data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## ad[flavor == "REGULAR"] 2 93 46.4 0.092 0.912
## Residuals 1200 605663 504.7
Overall, Advertisement lead to an obvious increase in sale in all flavors for General Mills.
From Statistical analysis:
grid.arrange(
bar_chart_math(flavor, revenue, promo, "mean", "GENERAL MILLS", "Revenue", "Promotion")+
stat_summary(geom= "errorbar", fun.data = "mean_cl_normal", width = 0.4, position = position_dodge(0.9)),
bar_chart_math(flavor, revenue, promo, "sum", "GENERAL MILLS", "Revenue","Promotion"), nrow = 2)
b<- arrangeGrob(
bar_chart_math(flavor, revenue, promo, "mean", "GENERAL MILLS", "Revenue", "Promotion")+
stat_summary(geom= "errorbar", fun.data = "mean_cl_normal", width = 0.4, position = position_dodge(0.9)),
bar_chart_math(flavor, revenue, promo, "sum", "GENERAL MILLS", "Total Revenue","Promotion"), nrow = 2,
top = "Figure 2. Cocoa Flavor Generates the Lowest Revenue\nbut Yields the highest Promotion Premium")
ggsave("flavor.png", b)
## Saving 4 x 4 in image
People buy more cereal units if there is a promotion.
#Create graph function
line_graph<- function(x, y, z,math){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
tidy_table %>%
filter(producer == "GENERAL MILLS") %>%
ggplot(aes(x = !!x, y = !!y, color = !!z))+
stat_summary(fun.y = math , geom = "point") + stat_summary(fun.y = math, geom = "line") +
theme_bw() + theme(panel.border = element_blank())
}
grid.arrange(
#Graph time-series of units sold
line_graph(week, units, promo, "median" ), #Promotion
#Graph time-series of total revenue per sale
line_graph(week,revenue,promo, "mean"),#Promotion
nrow = 2)
grid.arrange(
#Graph time-series of units sold
line_graph(week, units, ad,"median"), #Ads
#Graph time-series of total revenue per sale
line_graph(week, revenue, ad, "mean"),
nrow = 2)
#Revenue
#Difference in revenue in with or without promotion program
t.test(tidy_table$revenue ~ tidy_table$promo)
##
## Welch Two Sample t-test
##
## data: tidy_table$revenue by tidy_table$promo
## t = -4.9152, df = 7766.8, p-value = 9.05e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.640645 -1.134898
## sample estimates:
## mean in group No mean in group Yes
## 30.61953 32.50730
#Difference in revenue in different advertisement program
summary(aov(tidy_table$revenue ~ tidy_table$ad))
## Df Sum Sq Mean Sq F value Pr(>F)
## tidy_table$ad 2 15769 7885 13.11 2.04e-06 ***
## Residuals 21847 13140695 601
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1,2))
boxplot(tidy_table$revenue ~ tidy_table$promo)
boxplot(tidy_table$revenue ~ tidy_table$ad)
#Units sold
#Difference in units sold in with or without promotion program
t.test(tidy_table$units ~ tidy_table$promo)
##
## Welch Two Sample t-test
##
## data: tidy_table$units by tidy_table$promo
## t = -27.59, df = 6398.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.540125 -3.070437
## sample estimates:
## mean in group No mean in group Yes
## 7.891419 11.196700
#Difference in units sold in different advertisement program
summary(aov(tidy_table$units ~ tidy_table$ad))
## Df Sum Sq Mean Sq F value Pr(>F)
## tidy_table$ad 2 16178 8089 183.1 <2e-16 ***
## Residuals 21847 965206 44
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1,2))
boxplot(tidy_table$units ~ tidy_table$promo)
boxplot(tidy_table$units ~ tidy_table$ad)
promotion_ad <- function(x,ylab,title, math) {
x <- enquo(x)
GM_table %>%
group_by(promo,ad) %>%
ggplot(aes(x =promo, y = !!x,fill=promo)) +
stat_summary(geom="bar", fun.y = math) +
facet_wrap(.~ad) +
ylab(ylab)+
xlab("Promotion") +
ggtitle(title) +
theme_bw() +
theme(
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
scale_fill_brewer(palette = "Blues",guide = guide_legend(title = "Promotions"))
}
GM_table <- tidy_table %>%
filter(producer == "GENERAL MILLS") %>%
select(week,promo,units,ad,revenue) %>%
group_by(week,promo,ad) %>%
summarise_all(mean)
GM_table[is.na(GM_table)] <- 0
g <- arrangeGrob(
promotion_ad(revenue,"Mean Revenue (Mean)"," ", mean)+ stat_summary(geom = "errorbar", fun.data = "mean_cl_normal", width=0.1),
promotion_ad(units,"Quantity (Median)"," ", median) ,
nrow =2, top = "Figure 1. Advertisements and Promotions have different effects\n on Revenue and Quantity Sold"
)
ggsave("promo-ad-interaction.png", g)
grid.arrange(
promotion_ad(revenue,"Mean Revenue (Mean)","Revenue varies by promotions and Ads", mean)+ stat_summary(geom = "errorbar", fun.data = "mean_cl_normal", width=0.1),
promotion_ad(units,"Quantity (Median)","Quantity varies by promotions and Ads", median) ,
nrow =2
)
Promotions stimulate customers to buy more products each time
Promotions lead to a slight increase in revenue
The effect of Medium/Small Ads are worse than the others in both quantity and revenue without a promotion.But with a promotion, Meium/Small Ads lead to more sale each time than that without ads.
#Statistical Testing - two-way ANOVA
#Revenue and ad-promo
summary(aov(revenue ~ ad + promo, data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## ad 2 1814 906.8 1.307 0.271
## promo 1 120 120.5 0.174 0.677
## Residuals 7185 4984697 693.8
#Revenue and ad-promo
summary(aov(revenue ~ promo, data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## promo 1 332 331.7 0.478 0.489
## Residuals 7187 4986299 693.8
summary(aov(units ~ ad + promo, data = GM))
## Df Sum Sq Mean Sq F value Pr(>F)
## ad 2 4049 2025 43.71 <2e-16 ***
## promo 1 9592 9592 207.10 <2e-16 ***
## Residuals 7185 332772 46
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- lm(revenue ~ producer + ad + promo + units + flavor + volume + ad*promo,data = tidy_table)
summary(model)
##
## Call:
## lm(formula = revenue ~ producer + ad + promo + units + flavor +
## volume + ad * promo, data = tidy_table)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.517 -4.083 -0.184 3.861 65.231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.779655 0.414412 -21.186 < 2e-16 ***
## producerKELLOGGS -2.237334 0.149455 -14.970 < 2e-16 ***
## producerPOST -4.236435 0.236793 -17.891 < 2e-16 ***
## adMedium/Small Ads -0.868366 0.522371 -1.662 0.0965 .
## adNo Ads 0.136095 0.325102 0.419 0.6755
## promoYes -12.173548 0.439064 -27.726 < 2e-16 ***
## units 3.364435 0.008857 379.866 < 2e-16 ***
## flavorCOCOA -0.007822 0.286071 -0.027 0.9782
## flavorFRUIT 0.690993 0.307475 2.247 0.0246 *
## flavorREGULAR -1.635735 0.250828 -6.521 7.12e-11 ***
## flavorTOASTED 2.701090 0.236352 11.428 < 2e-16 ***
## volume 13.803409 0.169107 81.625 < 2e-16 ***
## adMedium/Small Ads:promoYes 1.538483 0.683236 2.252 0.0243 *
## adNo Ads:promoYes 5.319945 0.466380 11.407 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.306 on 21836 degrees of freedom
## Multiple R-squared: 0.8855, Adjusted R-squared: 0.8854
## F-statistic: 1.299e+04 on 13 and 21836 DF, p-value: < 2.2e-16
# plot residuals to check for patterns
par(mfrow = c(2,3))
plot(tidy_table$producer,model$residuals)
plot(tidy_table$flavor,model$residuals)
plot(tidy_table$units,model$residuals)
plot(tidy_table$ad,model$residuals)
plot(tidy_table$volume,model$residuals)
model_diag <- plot_model(model, type ="diag")
model_diag[1]
## [[1]]
model_diag[2]
## [[1]]
model_diag[3]
## [[1]]
model_diag[4]
## [[1]]
Conclusion
This model satisfy the basic assumption of multiple linear regression model to be unbiased.
However, this model is heteroscedasticity. As x value increases, the residuals also increases.
# Pull out the coefficients and confidence interval for table and graph
coefficent <- summary(model)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coefficent[-1, ], confint(model)[-1, ])) # find and bind CI, remove Intercept
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI")
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE),],3))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| units | 3.364 | 0.009 | 379.866 | 0 | 3.347 | 3.382 |
| volume | 13.803 | 0.169 | 81.625 | 0 | 13.472 | 14.135 |
| promoYes | -12.174 | 0.439 | -27.726 | 0 | -13.034 | -11.313 |
| producerPOST | -4.236 | 0.237 | -17.891 | 0 | -4.701 | -3.772 |
| producerKELLOGGS | -2.237 | 0.149 | -14.97 | 0 | -2.53 | -1.944 |
| flavorTOASTED | 2.701 | 0.236 | 11.428 | 0 | 2.238 | 3.164 |
| adNo Ads:promoYes | 5.32 | 0.466 | 11.407 | 0 | 4.406 | 6.234 |
| flavorREGULAR | -1.636 | 0.251 | -6.521 | 0 | -2.127 | -1.144 |
| adMedium/Small Ads:promoYes | 1.538 | 0.683 | 2.252 | 0.024 | 0.199 | 2.878 |
| flavorFRUIT | 0.691 | 0.307 | 2.247 | 0.025 | 0.088 | 1.294 |
| adMedium/Small Ads | -0.868 | 0.522 | -1.662 | 0.096 | -1.892 | 0.156 |
| adNo Ads | 0.136 | 0.325 | 0.419 | 0.675 | -0.501 | 0.773 |
| flavorCOCOA | -0.008 | 0.286 | -0.027 | 0.978 | -0.569 | 0.553 |
Comments
The relationship of Revenue with Promotion, units, volume, producer, No ads, regular and toasted flavor are all statistical significant at 1% level of significance.
The relationship of revenue with fruit flavor is statistical significant at 5% level of significance.
Medium/Small Ads, No Ads, and Cocoa flavor does not have significant relationship with revenue.
ggplot(coe_CI,aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
geom_point() +
ylab("Variable") +
xlab("Coefficient with Confidence Interval") +
theme_bw()+
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$high_CI,color= "Blue") +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$low_CI,color= "Blue") +
geom_vline(xintercept = 0, color= "Red")
Controlling for flavors, package volumes, units sold and advertisement, sales with promotion on average give companies $5.18 less than sales without promotion.
Controlling for flavors, package volumes, units sold and promotion, sales with no advertisement on average give companies $5.45 more than than sales with Big Ads.
Given that the price of a cereal unit is around $2 to $3, this gain or loss on a transaction is economically significant.
Buying pattern:
Most of the time, customers purchased cereals without advertisement (88%) or without promotions (79%).
Seventy-nine percent (79%) of the time, customers purchased cereals without any advertisement and promotions.
Flavor analysis:
Regular flavor is the best-seller on the market, while this is not a well-sold flavor for General Mills. This flavor also goes on discount the most frequently.
The biggest competitor for General Mills is Kellogg and their strength is Regular flavor cereals.
Compare sales with Promotions/Advertisement and without Promotions/Advertisement:
Promotion and advertisement make number of cereal units sold per week significantly higher than number of units sold without promotion or advertisement.
Promotion and advertisement also makes revenue per week statistically significantly higher than revenue without promotion or advertisement. However, there are more fluctuation in the advertisement condition versus no advertisement sales.
Multi-linear Regression Model:
# Save the rds file so I can reuse anything from this file in another file
save.image("mid.RData")
# recover using load()
load("mid.RData")